library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0      ✔ purrr   1.0.1 
## ✔ tibble  3.1.8      ✔ dplyr   1.0.10
## ✔ tidyr   1.3.0      ✔ stringr 1.5.0 
## ✔ readr   2.1.3      ✔ forcats 0.5.2 
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(lme4)
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## 
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
library(modelr)
library(viridis)
## Loading required package: viridisLite
library(ggrepel)
## Warning: package 'ggrepel' was built under R version 4.2.3
library(latex2exp)
library(ggridges)
## Warning: package 'ggridges' was built under R version 4.2.3
library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## 
## The following object is masked from 'package:dplyr':
## 
##     recode
## 
## The following object is masked from 'package:purrr':
## 
##     some
library(lmerTest)
## 
## Attaching package: 'lmerTest'
## 
## The following object is masked from 'package:lme4':
## 
##     lmer
## 
## The following object is masked from 'package:stats':
## 
##     step
library(lmtest)
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## 
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(broom.mixed)
## Warning: package 'broom.mixed' was built under R version 4.2.3
library(jtools)
dirname <- "/Users/nathanielimel/uci/projects/citesim/src/notebooks/"
get_data_v <- function(vectorizer) {
  fp <- paste(dirname, vectorizer, "_main_data.csv", sep="")
  v_data <- read_csv(fp)
  v_data$vectorizer <- vectorizer
  return(v_data)
}

df_all <- rbind(
  get_data_v("SciBERT"),
  get_data_v("SBERT"),
  get_data_v("GPT2"),
  get_data_v("Word2Vec"),
  get_data_v("BOW")
)
## Rows: 2375 Columns: 14
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (1): field
## dbl (13): density_bin, freq, count, log_cpy entropy, log_cpy variance, log_c...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 2302 Columns: 14
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (1): field
## dbl (13): density_bin, freq, count, log_cpy entropy, log_cpy variance, log_c...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 4112 Columns: 14
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (1): field
## dbl (13): density_bin, freq, count, log_cpy entropy, log_cpy variance, log_c...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 2790 Columns: 14
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (1): field
## dbl (13): density_bin, freq, count, log_cpy entropy, log_cpy variance, log_c...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 2649 Columns: 14
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (1): field
## dbl (13): density_bin, freq, count, log_cpy entropy, log_cpy variance, log_c...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# df_all <- read_csv("/Users/nathanielimel/uci/projects/citesim/src/notebooks/SBERT_SciBERT_GPT2_main_data.csv")
head(df_all)
## # A tibble: 6 × 15
##   densit…¹    freq count log_c…² log_c…³ log_c…⁴ cpy_var cpy_med cpy_m…⁵ ref_med
##      <dbl>   <dbl> <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>
## 1     29.8 1.08e-3    12      NA   0.434   0.660    89.9    2.37    7.00    69.5
## 2     29.9 9.86e-4    11      NA   0.578   0.827   526.     6      14.2     42  
## 3     30.4 1.08e-3    12      NA   0.440   0.802  1346.     6.06   17.4     60  
## 4     30.5 1.08e-3    12      NA   0.383   0.420    28.8    2.28    4.48    46.5
## 5     30.7 1.34e-3    15      NA   0.361   0.684    33.7    1.73    5.38    53  
## 6     30.9 1.08e-3    12      NA   0.652   0.389  1710.     1.68   19.8     59.5
## # … with 5 more variables: ref_var <dbl>, year_med <dbl>, field <chr>,
## #   log_cpy_var <dbl>, vectorizer <chr>, and abbreviated variable names
## #   ¹​density_bin, ²​`log_cpy entropy`, ³​`log_cpy variance`, ⁴​`log_cpy median`,
## #   ⁵​cpy_mean
get_anova <- function(y, vectorizer_name) {
  
  df_vectorizer <- df_all %>% filter(
    vectorizer == vectorizer_name
  ) %>% mutate(
    # z-scale
    y_z = scale(.data[[y]]),
    density_bin_z = scale(density_bin),
    year_med_z = scale(year_med)
  )
  
  # Model 4: age, density, and random density slopes and intercepts
  label_4_both <- paste(
    # y,
    # " ~ year_med + density_bin + (1 + density_bin | field)"
    "y_z ~ year_med_z + density_bin_z + (1 + density_bin_z | field)"
  )
  fm_4_both <- as.formula(label_4_both)
  model_4_both <- lmer(
    fm_4_both, 
    data=df_vectorizer, 
    control=lmerControl(optimizer="bobyqa", optCtrl=list(maxfun=2e5))
  )
  
  # Model 3: density and random intercepts
  label_3_rho <- paste(
    # y,
    # "~ density_bin + (1 | field)"
    "y_z ~ density_bin_z + (1 | field)"
  )
  fm_3_rho <- as.formula(label_3_rho)
  model_3_rho <- lmer(
    fm_3_rho,
    data=df_vectorizer, 
    control=lmerControl(optimizer="bobyqa", optCtrl=list(maxfun=2e5))
  )
  
  # Model 2: age and random intercepts
  label_2_age <- paste(
    # y,
    # " ~ year_med + (1 | field)"
    "y_z ~ year_med_z + (1 | field)"
  )
  fm_2_age <- as.formula(label_2_age)
  model_2_age <- lmer(fm_2_age, data=df_vectorizer)
  
  
  # Model 1: intercept only (mean)
  label_1_int <- paste(
    # y,
    # " ~ 1"
    "y_z ~ 1"
  )
  fm_1_int <- as.formula(label_1_int)
  model_1_int <- lm(fm_1_int, data=df_vectorizer)
  
  results <- anova(
    model_4_both,
    model_3_rho,
    model_2_age,
    model_1_int
  )
  
  results_df <- tidy(results)
  
  results_model_4_both <- results_df %>% filter(
    term == "model_4_both",
  )
  
  results_annotated <- results_df %>% mutate(
    bic_gain = results$BIC - results_model_4_both$BIC,
    vectorizer = vectorizer_name,
    predictor = y,
    # Always check that the results are actually ordered in this way...should use a dict but oh well
    # fms = c(label_4_both, label_3_rho, label_2_age, label_1_int),
    # fm = c(label_1_int, label_2_age, label_3_rho, label_4_both),
  )
  print(summary(model_4_both))
  print("------------------------------")
  # print(results)
  
  return(results_annotated)
}
# test the function
# get_anova("log_cpy_var", "SBERT")
# get_anova("cpy_med", "SBERT")
# get_anova("log_cpy_var", "SciBERT")
# get_anova("cpy_med", "SciBERT")
# get_anova("log_cpy_var", "GPT2")
# get_anova("log_cpy_var", "Word2Vec")
get_anova("log_cpy_var", "BOW")
## refitting model(s) with ML (instead of REML)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: fm_4_both
##    Data: df_vectorizer
## Control: lmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 2e+05))
## 
## REML criterion at convergence: 6432.6
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -3.8108 -0.6673 -0.1173  0.5447  7.0614 
## 
## Random effects:
##  Groups   Name          Variance Std.Dev. Corr 
##  field    (Intercept)   0.2393   0.4892        
##           density_bin_z 0.0608   0.2466   -0.42
##  Residual               0.6489   0.8056        
## Number of obs: 2649, groups:  field, 8
## 
## Fixed effects:
##                 Estimate Std. Error         df t value Pr(>|t|)  
## (Intercept)     -0.03158    0.17588    7.07875  -0.180   0.8625  
## year_med_z       0.01213    0.02194 1778.76178   0.553   0.5802  
## density_bin_z   -0.24505    0.09675    6.35506  -2.533   0.0424 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) yr_md_
## year_med_z  -0.051       
## densty_bn_z -0.321 -0.172
## [1] "------------------------------"
## # A tibble: 4 × 12
##   term          npar   AIC   BIC logLik devia…¹ stati…²    df    p.value bic_g…³
##   <chr>        <dbl> <dbl> <dbl>  <dbl>   <dbl>   <dbl> <dbl>      <dbl>   <dbl>
## 1 model_1_int      2 7521. 7532. -3758.   7517.    NA      NA NA         1055.  
## 2 model_3_rho      4 6455. 6479. -3224.   6447.  1069.      2  6.55e-233    1.62
## 3 model_2_age      4 6496. 6520. -3244.   6488.     0       0 NA           42.6 
## 4 model_4_both     7 6436. 6477. -3211.   6422.    66.3     3  2.67e- 14    0   
## # … with 2 more variables: vectorizer <chr>, predictor <chr>, and abbreviated
## #   variable names ¹​deviance, ²​statistic, ³​bic_gain
df_plot <- rbind(
  get_anova("log_cpy_var", "SciBERT"),
  get_anova("log_cpy_var", "SBERT"),
  get_anova("cpy_med", "SciBERT"),
  get_anova("cpy_med", "SBERT")
) %>% filter(
  term != "model_4_both",
  term != "model_1_int",
)
## refitting model(s) with ML (instead of REML)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: fm_4_both
##    Data: df_vectorizer
## Control: lmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 2e+05))
## 
## REML criterion at convergence: 5254.1
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -3.8654 -0.5932 -0.0719  0.4756  7.9005 
## 
## Random effects:
##  Groups   Name          Variance Std.Dev. Corr
##  field    (Intercept)   0.19106  0.4371       
##           density_bin_z 0.05907  0.2431   0.60
##  Residual               0.51870  0.7202       
## Number of obs: 2375, groups:  field, 9
## 
## Fixed effects:
##                 Estimate Std. Error         df t value Pr(>|t|)    
## (Intercept)     -0.02079    0.14695    8.05646  -0.142  0.89095    
## year_med_z       0.03674    0.02344 2328.71793   1.567  0.11718    
## density_bin_z   -0.46196    0.08412    8.66008  -5.492  0.00044 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) yr_md_
## year_med_z  -0.024       
## densty_bn_z  0.582 -0.174
## [1] "------------------------------"
## refitting model(s) with ML (instead of REML)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: fm_4_both
##    Data: df_vectorizer
## Control: lmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 2e+05))
## 
## REML criterion at convergence: 5561.8
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -3.6399 -0.6302 -0.0828  0.4669  9.1671 
## 
## Random effects:
##  Groups   Name          Variance Std.Dev. Corr
##  field    (Intercept)   0.20129  0.4487       
##           density_bin_z 0.01563  0.1250   0.10
##  Residual               0.63986  0.7999       
## Number of obs: 2302, groups:  field, 9
## 
## Fixed effects:
##                 Estimate Std. Error         df t value Pr(>|t|)    
## (Intercept)    5.055e-02  1.530e-01  8.206e+00   0.330 0.749307    
## year_med_z    -4.007e-03  2.690e-02  1.699e+03  -0.149 0.881613    
## density_bin_z -3.171e-01  5.514e-02  1.010e+01  -5.752 0.000178 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) yr_md_
## year_med_z  -0.055       
## densty_bn_z  0.133 -0.421
## [1] "------------------------------"
## refitting model(s) with ML (instead of REML)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: fm_4_both
##    Data: df_vectorizer
## Control: lmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 2e+05))
## 
## REML criterion at convergence: 3697.3
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -3.3665 -0.4812 -0.1186  0.3347 11.4436 
## 
## Random effects:
##  Groups   Name          Variance Std.Dev. Corr
##  field    (Intercept)   0.60811  0.7798       
##           density_bin_z 0.06356  0.2521   0.68
##  Residual               0.26697  0.5167       
## Number of obs: 2375, groups:  field, 9
## 
## Fixed effects:
##                 Estimate Std. Error         df t value Pr(>|t|)  
## (Intercept)   -1.253e-01  2.603e-01  8.005e+00  -0.481    0.643  
## year_med_z     9.221e-03  1.691e-02  2.370e+03   0.545    0.586  
## density_bin_z  2.359e-01  8.561e-02  8.299e+00   2.755    0.024 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) yr_md_
## year_med_z  -0.010       
## densty_bn_z  0.667 -0.124
## [1] "------------------------------"
## refitting model(s) with ML (instead of REML)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: fm_4_both
##    Data: df_vectorizer
## Control: lmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 2e+05))
## 
## REML criterion at convergence: 3761.7
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -2.7080 -0.4459 -0.0888  0.2831 11.3816 
## 
## Random effects:
##  Groups   Name          Variance Std.Dev. Corr
##  field    (Intercept)   0.59315  0.7702       
##           density_bin_z 0.03085  0.1756   0.68
##  Residual               0.28993  0.5385       
## Number of obs: 2302, groups:  field, 9
## 
## Fixed effects:
##                 Estimate Std. Error         df t value Pr(>|t|)  
## (Intercept)     -0.24693    0.25776    8.05023  -0.958   0.3659  
## year_med_z       0.01639    0.01840 2277.08893   0.891   0.3731  
## density_bin_z    0.12776    0.06389    9.62107   2.000   0.0745 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) yr_md_
## year_med_z  -0.024       
## densty_bn_z  0.642 -0.255
## [1] "------------------------------"
(
  ggplot(
    df_plot,
    aes(
      x=bic_gain,
      y=term,
      fill=vectorizer,
    )
  )
  + geom_bar( stat = "identity", position = "dodge")
  # + xlim(-4000, -1000)
  + theme_classic()
  # + scale_y_discrete(position = "right")
  # + scale_fill_manual( values = c("lightblue", "darkblue"))
  # + scale_fill_brewer()
  # + facet_grid("vectorizer ~ predictor")
)

# it's hard to visualize via faceting because it makes significant differences in BIC look insignificant. So let's just try printing 4 separate plots with diff scales?

plot_bic <- function(df_plot) {
  
  df_plot <- df_plot %>% filter(
    # term != "model_4_both",
    term != "model_1_int",
  )
  
  plot <- (
    ggplot(
      df_plot,
      aes(
        x=bic_gain,
        y=term,
      )
    )
    + geom_bar( stat = "identity", position = "dodge")
    + theme_classic()
  )
  
  return(plot)
}

for (predictor in c("cpy_med", "log_cpy_var")){
  for (vectorizer in c("SciBERT", "SBERT")){
    print(plot_bic(
      get_anova(predictor, vectorizer)
    ))
  }
}
## refitting model(s) with ML (instead of REML)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: fm_4_both
##    Data: df_vectorizer
## Control: lmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 2e+05))
## 
## REML criterion at convergence: 3697.3
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -3.3665 -0.4812 -0.1186  0.3347 11.4436 
## 
## Random effects:
##  Groups   Name          Variance Std.Dev. Corr
##  field    (Intercept)   0.60811  0.7798       
##           density_bin_z 0.06356  0.2521   0.68
##  Residual               0.26697  0.5167       
## Number of obs: 2375, groups:  field, 9
## 
## Fixed effects:
##                 Estimate Std. Error         df t value Pr(>|t|)  
## (Intercept)   -1.253e-01  2.603e-01  8.005e+00  -0.481    0.643  
## year_med_z     9.221e-03  1.691e-02  2.370e+03   0.545    0.586  
## density_bin_z  2.359e-01  8.561e-02  8.299e+00   2.755    0.024 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) yr_md_
## year_med_z  -0.010       
## densty_bn_z  0.667 -0.124
## [1] "------------------------------"
## refitting model(s) with ML (instead of REML)

## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: fm_4_both
##    Data: df_vectorizer
## Control: lmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 2e+05))
## 
## REML criterion at convergence: 3761.7
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -2.7080 -0.4459 -0.0888  0.2831 11.3816 
## 
## Random effects:
##  Groups   Name          Variance Std.Dev. Corr
##  field    (Intercept)   0.59315  0.7702       
##           density_bin_z 0.03085  0.1756   0.68
##  Residual               0.28993  0.5385       
## Number of obs: 2302, groups:  field, 9
## 
## Fixed effects:
##                 Estimate Std. Error         df t value Pr(>|t|)  
## (Intercept)     -0.24693    0.25776    8.05023  -0.958   0.3659  
## year_med_z       0.01639    0.01840 2277.08893   0.891   0.3731  
## density_bin_z    0.12776    0.06389    9.62107   2.000   0.0745 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) yr_md_
## year_med_z  -0.024       
## densty_bn_z  0.642 -0.255
## [1] "------------------------------"
## refitting model(s) with ML (instead of REML)

## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: fm_4_both
##    Data: df_vectorizer
## Control: lmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 2e+05))
## 
## REML criterion at convergence: 5254.1
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -3.8654 -0.5932 -0.0719  0.4756  7.9005 
## 
## Random effects:
##  Groups   Name          Variance Std.Dev. Corr
##  field    (Intercept)   0.19106  0.4371       
##           density_bin_z 0.05907  0.2431   0.60
##  Residual               0.51870  0.7202       
## Number of obs: 2375, groups:  field, 9
## 
## Fixed effects:
##                 Estimate Std. Error         df t value Pr(>|t|)    
## (Intercept)     -0.02079    0.14695    8.05646  -0.142  0.89095    
## year_med_z       0.03674    0.02344 2328.71793   1.567  0.11718    
## density_bin_z   -0.46196    0.08412    8.66008  -5.492  0.00044 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) yr_md_
## year_med_z  -0.024       
## densty_bn_z  0.582 -0.174
## [1] "------------------------------"
## refitting model(s) with ML (instead of REML)

## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: fm_4_both
##    Data: df_vectorizer
## Control: lmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 2e+05))
## 
## REML criterion at convergence: 5561.8
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -3.6399 -0.6302 -0.0828  0.4669  9.1671 
## 
## Random effects:
##  Groups   Name          Variance Std.Dev. Corr
##  field    (Intercept)   0.20129  0.4487       
##           density_bin_z 0.01563  0.1250   0.10
##  Residual               0.63986  0.7999       
## Number of obs: 2302, groups:  field, 9
## 
## Fixed effects:
##                 Estimate Std. Error         df t value Pr(>|t|)    
## (Intercept)    5.055e-02  1.530e-01  8.206e+00   0.330 0.749307    
## year_med_z    -4.007e-03  2.690e-02  1.699e+03  -0.149 0.881613    
## density_bin_z -3.171e-01  5.514e-02  1.010e+01  -5.752 0.000178 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) yr_md_
## year_med_z  -0.055       
## densty_bn_z  0.133 -0.421
## [1] "------------------------------"

get_summary <- function(y, vectorizer_name) {
  
  df_vectorizer <- df_all %>% filter(
    vectorizer == vectorizer_name
  ) %>% mutate(
    # z-scale
    y_z = scale(.data[[y]]),
    density_bin_z = scale(density_bin),
    year_med_z = scale(year_med)
  )
  
  # Model 4: age, density, and random density slopes and intercepts
  label_4_both <- paste(
    # y,
    # " ~ year_med + density_bin + (1 + density_bin | field)"
    "y_z ~ year_med_z + density_bin_z + (1 + density_bin_z | field)"
  )
  fm_4_both <- as.formula(label_4_both)
  model_4_both <- lmer(
    fm_4_both, 
    data=df_vectorizer, 
    control=lmerControl(optimizer="bobyqa", optCtrl=list(maxfun=2e5))
  )
  print(summary(model_4_both))
  
  return(model_4_both)
  
}

tidy(get_summary("log_cpy_var", "SciBERT"))
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: fm_4_both
##    Data: df_vectorizer
## Control: lmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 2e+05))
## 
## REML criterion at convergence: 5254.1
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -3.8654 -0.5932 -0.0719  0.4756  7.9005 
## 
## Random effects:
##  Groups   Name          Variance Std.Dev. Corr
##  field    (Intercept)   0.19106  0.4371       
##           density_bin_z 0.05907  0.2431   0.60
##  Residual               0.51870  0.7202       
## Number of obs: 2375, groups:  field, 9
## 
## Fixed effects:
##                 Estimate Std. Error         df t value Pr(>|t|)    
## (Intercept)     -0.02079    0.14695    8.05646  -0.142  0.89095    
## year_med_z       0.03674    0.02344 2328.71793   1.567  0.11718    
## density_bin_z   -0.46196    0.08412    8.66008  -5.492  0.00044 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) yr_md_
## year_med_z  -0.024       
## densty_bn_z  0.582 -0.174
## # A tibble: 7 × 8
##   effect   group    term                estim…¹ std.e…² stati…³      df  p.value
##   <chr>    <chr>    <chr>                 <dbl>   <dbl>   <dbl>   <dbl>    <dbl>
## 1 fixed    <NA>     (Intercept)         -0.0208  0.147   -0.142    8.06  8.91e-1
## 2 fixed    <NA>     year_med_z           0.0367  0.0234   1.57  2329.    1.17e-1
## 3 fixed    <NA>     density_bin_z       -0.462   0.0841  -5.49     8.66  4.40e-4
## 4 ran_pars field    sd__(Intercept)      0.437  NA       NA       NA    NA      
## 5 ran_pars field    cor__(Intercept).d…  0.602  NA       NA       NA    NA      
## 6 ran_pars field    sd__density_bin_z    0.243  NA       NA       NA    NA      
## 7 ran_pars Residual sd__Observation      0.720  NA       NA       NA    NA      
## # … with abbreviated variable names ¹​estimate, ²​std.error, ³​statistic
getrefs <- function(y, vectorizer_name, print_summary=TRUE) {
  
  df_vectorizer <- df_all %>% filter(
    vectorizer == vectorizer_name
  ) %>% mutate(
    # z-scale
    y_z = scale(.data[[y]]),
    density_bin_z = scale(density_bin),
    year_med_z = scale(year_med),
    ref_med_z = scale(ref_med),
    freq_z = scale(freq),
  ) %>% filter(
    y_z < 3,
    y_z > -3,
    density_bin_z < 3,
    density_bin_z > -3,
    ref_med_z < 3,
    ref_med_z > -3,
    freq_z < 3,
    freq_z > -3
  )
  
  # Model 5: 
  label_5 <- paste(
    # "y_z ~ year_med_z + density_bin_z + (1 | field)"
    # "y_z ~ ref_med_z + year_med_z + density_bin_z + (1 | field)"
    # "y_z ~ ref_med_z + year_med_z + density_bin_z + (1 + density_bin_z + ref_med_z | field)"
    "y_z ~ ref_med_z + year_med_z + freq_z + density_bin_z + (1 + density_bin_z | field)"
    # "y_z ~ ref_med_z + year_med_z + freq_z + density_bin_z + (1 | field)"    
  )
  fm_5 <- as.formula(label_5)
  model_5 <- lmer(
    fm_5, 
    data=df_vectorizer, 
    control=lmerControl(optimizer="bobyqa", optCtrl=list(maxfun=2e5))
  )
  
  if (print_summary) {
    print(summary(model_5))    
  }
  
  # model_ref_rho <- lmer(
  #   ref_med_z ~ density_bin_z + (1 + density_bin_z | field), 
  #   data=df_vectorizer,
  #   control=lmerControl(optimizer="bobyqa", optCtrl=list(maxfun=2e5))
  # )
  # 
  # print(summary(model_ref_rho))

  return(model_5)
  
}

# summary(getrefs("log_cpy_var", "GPT2", FALSE))
getrefs("log_cpy_var", "SciBERT", FALSE)
## Linear mixed model fit by REML ['lmerModLmerTest']
## Formula: fm_5
##    Data: df_vectorizer
## REML criterion at convergence: 4653.778
## Random effects:
##  Groups   Name          Std.Dev. Corr
##  field    (Intercept)   0.4310       
##           density_bin_z 0.2200   0.63
##  Residual               0.6541       
## Number of obs: 2299, groups:  field, 9
## Fixed Effects:
##   (Intercept)      ref_med_z     year_med_z         freq_z  density_bin_z  
##      -0.03233       -0.03303        0.03197       -0.02746       -0.40171
# sigs_dfs <- tibble()
# 
# for (vec in unique(df_all$vectorizer)) {
#   sigs_df <- tidy(getrefs(y, vec, FALSE)) %>% filter(
#     effect == "fixed",
#     term != "(Intercept)"
#   ) %>% select(
#     term,
#     estimate,
#     p.value,
#   ) %>% mutate(
#     model = vec
#   )
#   sigs_dfs <- rbind(sigs_dfs, sigs_df)
# 
# }
# 
# sigs_dfs <- sigs_dfs %>% mutate(
#   significance = `p.value` < 0.05
# )
# sigs_dfs
for (y in c("cpy_med", "log_cpy_var")) {
  print(
    
   plot_summs(
      getrefs(y, "SciBERT",),
      getrefs(y, "SBERT"),
      getrefs(y, "GPT2"),
      getrefs(y, "Word2Vec"),
      getrefs(y, "BOW"),
      model.names = c("SciBERT", "SBERT", "GPT2", "Word2Vec", "BOW"),
      # model.names = c("SciBERT", "SBERT", "GPT2"),
      colors = "Rainbow",
      point.shape = FALSE,
      robust=TRUE, 
      inner_ci_level = .9
      # plot.distributions = TRUE
    )
      # + scale_color_viridis(discrete=TRUE)
    # + scale_color_manual( values = c("blue", "red"))
    + theme(
      axis.text.x = element_text(size=20)
    )
    # Add shapes for significance

  )  
  
}
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: fm_5
##    Data: df_vectorizer
## Control: lmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 2e+05))
## 
## REML criterion at convergence: 2674.5
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -3.6905 -0.5340 -0.1130  0.4452  4.6943 
## 
## Random effects:
##  Groups   Name          Variance Std.Dev. Corr
##  field    (Intercept)   0.44155  0.6645       
##           density_bin_z 0.05775  0.2403   0.54
##  Residual               0.17929  0.4234       
## Number of obs: 2293, groups:  field, 9
## 
## Fixed effects:
##                 Estimate Std. Error         df t value Pr(>|t|)    
## (Intercept)   -1.079e-01  2.220e-01  7.989e+00  -0.486   0.6399    
## ref_med_z      1.512e-01  1.829e-02  2.288e+03   8.266 2.32e-16 ***
## year_med_z    -7.479e-03  1.436e-02  2.286e+03  -0.521   0.6025    
## freq_z         1.117e-02  1.888e-02  2.280e+03   0.592   0.5542    
## density_bin_z  1.950e-01  8.142e-02  8.292e+00   2.394   0.0425 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) rf_md_ yr_md_ freq_z
## ref_med_z    0.024                     
## year_med_z  -0.017 -0.173              
## freq_z      -0.015  0.005  0.105       
## densty_bn_z  0.530 -0.034 -0.103 -0.038
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: fm_5
##    Data: df_vectorizer
## Control: lmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 2e+05))
## 
## REML criterion at convergence: 2701.8
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -2.9371 -0.5057 -0.0747  0.3894  5.9759 
## 
## Random effects:
##  Groups   Name          Variance Std.Dev. Corr
##  field    (Intercept)   0.44247  0.6652       
##           density_bin_z 0.01275  0.1129   0.60
##  Residual               0.18860  0.4343       
## Number of obs: 2236, groups:  field, 9
## 
## Fixed effects:
##                 Estimate Std. Error         df t value Pr(>|t|)    
## (Intercept)   -2.219e-01  2.230e-01  8.073e+00  -0.995    0.349    
## ref_med_z      1.435e-01  2.086e-02  1.718e+03   6.878 8.46e-12 ***
## year_med_z     7.255e-03  1.528e-02  2.197e+03   0.475    0.635    
## freq_z        -6.840e-02  4.327e-02  2.226e+03  -1.581    0.114    
## density_bin_z  5.579e-02  4.346e-02  1.057e+01   1.284    0.227    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) rf_md_ yr_md_ freq_z
## ref_med_z    0.030                     
## year_med_z  -0.028 -0.172              
## freq_z      -0.011  0.010 -0.003       
## densty_bn_z  0.535 -0.143 -0.271 -0.002
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: fm_5
##    Data: df_vectorizer
## Control: lmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 2e+05))
## 
## REML criterion at convergence: 5886
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -2.9412 -0.5544 -0.1130  0.4424  5.8618 
## 
## Random effects:
##  Groups   Name          Variance Std.Dev. Corr 
##  field    (Intercept)   0.49398  0.7028        
##           density_bin_z 0.01357  0.1165   -0.33
##  Residual               0.24985  0.4999        
## Number of obs: 3981, groups:  field, 9
## 
## Fixed effects:
##                Estimate Std. Error        df t value Pr(>|t|)    
## (Intercept)   2.354e-02  2.344e-01 7.984e+00   0.100   0.9225    
## ref_med_z     1.445e-01  1.396e-02 3.973e+03  10.346   <2e-16 ***
## year_med_z    2.255e-02  1.039e-02 3.972e+03   2.170   0.0300 *  
## freq_z        1.159e-02  9.363e-03 3.968e+03   1.237   0.2160    
## density_bin_z 9.694e-02  4.007e-02 8.230e+00   2.419   0.0411 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) rf_md_ yr_md_ freq_z
## ref_med_z   -0.001                     
## year_med_z   0.001 -0.341              
## freq_z       0.001 -0.010 -0.041       
## densty_bn_z -0.318  0.033 -0.054 -0.088
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: fm_5
##    Data: df_vectorizer
## Control: lmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 2e+05))
## 
## REML criterion at convergence: 3704.3
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -3.9179 -0.5408 -0.0782  0.3886  5.4505 
## 
## Random effects:
##  Groups   Name          Variance Std.Dev. Corr
##  field    (Intercept)   0.50971  0.7139       
##           density_bin_z 0.01459  0.1208   0.38
##  Residual               0.22268  0.4719       
## Number of obs: 2697, groups:  field, 9
## 
## Fixed effects:
##                 Estimate Std. Error         df t value Pr(>|t|)    
## (Intercept)     -0.15774    0.23824    7.96916  -0.662   0.5266    
## ref_med_z        0.11736    0.02164 2382.66994   5.423 6.47e-08 ***
## year_med_z       0.01173    0.01630 2551.20271   0.720   0.4718    
## freq_z          -0.01933    0.01129 2682.29647  -1.713   0.0868 .  
## density_bin_z   -0.03373    0.04454   10.16069  -0.757   0.4660    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) rf_md_ yr_md_ freq_z
## ref_med_z    0.010                     
## year_med_z  -0.001 -0.146              
## freq_z      -0.004  0.018 -0.053       
## densty_bn_z  0.339 -0.210 -0.245 -0.004
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: fm_5
##    Data: df_vectorizer
## Control: lmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 2e+05))
## 
## REML criterion at convergence: 3905.9
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -3.5459 -0.5489 -0.0859  0.4321  5.8348 
## 
## Random effects:
##  Groups   Name          Variance Std.Dev. Corr
##  field    (Intercept)   0.5847   0.7646       
##           density_bin_z 0.0170   0.1304   0.70
##  Residual               0.2675   0.5172       
## Number of obs: 2519, groups:  field, 8
## 
## Fixed effects:
##                 Estimate Std. Error         df t value Pr(>|t|)    
## (Intercept)     -0.21031    0.27111    6.95790  -0.776  0.46346    
## ref_med_z        0.12214    0.02114 2291.08060   5.777 8.63e-09 ***
## year_med_z       0.01549    0.01459 1152.08085   1.062  0.28867    
## freq_z          -0.03822    0.01996 2469.16024  -1.915  0.05558 .  
## density_bin_z   -0.19559    0.05392    8.53926  -3.627  0.00602 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) rf_md_ yr_md_ freq_z
## ref_med_z    0.017                     
## year_med_z  -0.020 -0.194              
## freq_z       0.005 -0.068 -0.089       
## densty_bn_z  0.619 -0.096 -0.176  0.082

## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: fm_5
##    Data: df_vectorizer
## Control: lmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 2e+05))
## 
## REML criterion at convergence: 4653.8
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -3.8335 -0.6259 -0.0553  0.5520  5.1991 
## 
## Random effects:
##  Groups   Name          Variance Std.Dev. Corr
##  field    (Intercept)   0.18578  0.4310       
##           density_bin_z 0.04841  0.2200   0.63
##  Residual               0.42789  0.6541       
## Number of obs: 2299, groups:  field, 9
## 
## Fixed effects:
##                 Estimate Std. Error         df t value Pr(>|t|)    
## (Intercept)     -0.03233    0.14558    7.89619  -0.222 0.829872    
## ref_med_z       -0.03303    0.02801 2016.02733  -1.179 0.238497    
## year_med_z       0.03197    0.02208 2218.18905   1.448 0.147875    
## freq_z          -0.02746    0.02899 2194.71086  -0.947 0.343731    
## density_bin_z   -0.40171    0.07671    8.89641  -5.237 0.000558 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) rf_md_ yr_md_ freq_z
## ref_med_z    0.053                     
## year_med_z  -0.038 -0.168              
## freq_z      -0.032  0.019  0.092       
## densty_bn_z  0.599 -0.058 -0.169 -0.056
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: fm_5
##    Data: df_vectorizer
## Control: lmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 2e+05))
## 
## REML criterion at convergence: 4933.4
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -3.8750 -0.6754 -0.0699  0.5681  4.2046 
## 
## Random effects:
##  Groups   Name          Variance Std.Dev. Corr
##  field    (Intercept)   0.205341 0.4531       
##           density_bin_z 0.007956 0.0892   0.24
##  Residual               0.515845 0.7182       
## Number of obs: 2239, groups:  field, 9
## 
## Fixed effects:
##                 Estimate Std. Error         df t value Pr(>|t|)    
## (Intercept)      0.05704    0.15533    8.12097   0.367 0.722830    
## ref_med_z        0.04669    0.03338  576.65815   1.398 0.162519    
## year_med_z      -0.01986    0.02471 1550.82590  -0.804 0.421697    
## freq_z           0.03613    0.07152 2025.50493   0.505 0.613520    
## density_bin_z   -0.26608    0.04531   10.88059  -5.873 0.000112 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) rf_md_ yr_md_ freq_z
## ref_med_z    0.062                     
## year_med_z  -0.057 -0.166              
## freq_z      -0.018  0.012 -0.014       
## densty_bn_z  0.195 -0.231 -0.414  0.029
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: fm_5
##    Data: df_vectorizer
## Control: lmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 2e+05))
## 
## REML criterion at convergence: 9401.4
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -2.9047 -0.6735 -0.1070  0.5424  3.9004 
## 
## Random effects:
##  Groups   Name          Variance Std.Dev. Corr 
##  field    (Intercept)   0.15236  0.3903        
##           density_bin_z 0.01323  0.1150   -0.18
##  Residual               0.61230  0.7825        
## Number of obs: 3970, groups:  field, 9
## 
## Fixed effects:
##                 Estimate Std. Error         df t value Pr(>|t|)    
## (Intercept)     -0.10504    0.13083    7.92441  -0.803  0.44545    
## ref_med_z       -0.05524    0.02181 3558.29844  -2.533  0.01136 *  
## year_med_z      -0.08828    0.01618 3879.27605  -5.456 5.18e-08 ***
## freq_z           0.04531    0.01460 3959.59851   3.104  0.00192 ** 
## density_bin_z   -0.07805    0.04129    8.64592  -1.890  0.09261 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) rf_md_ yr_md_ freq_z
## ref_med_z   -0.002                     
## year_med_z   0.003 -0.346              
## freq_z       0.003 -0.009 -0.043       
## densty_bn_z -0.172  0.051 -0.078 -0.131
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: fm_5
##    Data: df_vectorizer
## Control: lmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 2e+05))
## 
## REML criterion at convergence: 6221.2
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -3.6222 -0.6627 -0.1041  0.5764  4.0578 
## 
## Random effects:
##  Groups   Name          Variance Std.Dev. Corr
##  field    (Intercept)   0.2191   0.4680       
##           density_bin_z 0.0429   0.2071   0.02
##  Residual               0.5702   0.7551       
## Number of obs: 2696, groups:  field, 9
## 
## Fixed effects:
##                 Estimate Std. Error         df t value Pr(>|t|)  
## (Intercept)      0.06713    0.15705    7.94359   0.427   0.6804  
## ref_med_z       -0.02488    0.03458 2338.14436  -0.719   0.4719  
## year_med_z      -0.04334    0.02598 2480.56694  -1.668   0.0954 .
## freq_z           0.02210    0.01815 2685.82851   1.218   0.2234  
## density_bin_z   -0.02122    0.07546    9.74491  -0.281   0.7844  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) rf_md_ yr_md_ freq_z
## ref_med_z    0.024                     
## year_med_z  -0.003 -0.151              
## freq_z      -0.010  0.022 -0.048       
## densty_bn_z  0.015 -0.198 -0.229 -0.006
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: fm_5
##    Data: df_vectorizer
## Control: lmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 2e+05))
## 
## REML criterion at convergence: 5872
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -3.9663 -0.6909 -0.0868  0.5984  4.3496 
## 
## Random effects:
##  Groups   Name          Variance Std.Dev. Corr 
##  field    (Intercept)   0.23887  0.4887        
##           density_bin_z 0.05737  0.2395   -0.40
##  Residual               0.58292  0.7635        
## Number of obs: 2525, groups:  field, 8
## 
## Fixed effects:
##                 Estimate Std. Error         df t value Pr(>|t|)  
## (Intercept)   -4.924e-02  1.758e-01  6.885e+00  -0.280   0.7876  
## ref_med_z      5.566e-03  3.088e-02  1.754e+03   0.180   0.8570  
## year_med_z     2.747e-02  2.177e-02  1.595e+03   1.262   0.2072  
## freq_z         7.067e-03  2.951e-02  2.482e+03   0.239   0.8108  
## density_bin_z -2.765e-01  9.469e-02  6.404e+00  -2.920   0.0247 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) rf_md_ yr_md_ freq_z
## ref_med_z    0.038                     
## year_med_z  -0.055 -0.201              
## freq_z       0.015 -0.060 -0.095       
## densty_bn_z -0.307 -0.080 -0.163  0.075

# Looks like number of references is a better predictor of the typical citation rates than density. 
# However, we have very good reason to believe that density causes number of references, not the other way around!


# Create a new df grouped by vectorizer and field (should have done this above, refactor)

df_grouped <- df_all %>%
  group_by(field, vectorizer) %>%
  mutate_at(vars(cpy_med, density_bin, year_med, ref_med),
            list(z = ~scale(.))) %>%
  filter(cpy_med_z < 3,
         cpy_med_z > -3,
         density_bin_z < 3,
         density_bin_z > -3,
         ref_med_z < 3,
         ref_med_z > -3)

# head(df_grouped)
for (vec in unique(df_grouped$vectorizer)) {
  df_vec <- df_grouped %>% filter(
    vectorizer == vec
  )
  plot <- (
    ggplot(
      df_vec,
      aes(
        # x=density_bin_z,
        # y=ref_med_z,
        x=density_bin,
        y=ref_med,
        # color=field,
      )
    )
    + geom_smooth(
      method="lm",
      linewidth=3,
      color="black",
    )
    + geom_point(
      aes(
        color=field,
      ),
      size=0.5,
      alpha=0.2,
    )
    + geom_smooth(
      aes(
        color=field,      
      ),
      # method="lm",
      linewidth=1,
    )
    # + facet_grid("~ vectorizer")
    # + facet_wrap("vectorizer")
    + theme_classic()
    + ggtitle(vec)
    + theme(
      axis.text = element_text(size=20)
    )    
  )
  print(plot)
    
}
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

# ggsave(
#   "refs_by_vectorizers.png",
#   width=12,
#   height=3,
# )
getrefs("cpy_med", "SciBERT", TRUE)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: fm_5
##    Data: df_vectorizer
## Control: lmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 2e+05))
## 
## REML criterion at convergence: 2674.5
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -3.6905 -0.5340 -0.1130  0.4452  4.6943 
## 
## Random effects:
##  Groups   Name          Variance Std.Dev. Corr
##  field    (Intercept)   0.44155  0.6645       
##           density_bin_z 0.05775  0.2403   0.54
##  Residual               0.17929  0.4234       
## Number of obs: 2293, groups:  field, 9
## 
## Fixed effects:
##                 Estimate Std. Error         df t value Pr(>|t|)    
## (Intercept)   -1.079e-01  2.220e-01  7.989e+00  -0.486   0.6399    
## ref_med_z      1.512e-01  1.829e-02  2.288e+03   8.266 2.32e-16 ***
## year_med_z    -7.479e-03  1.436e-02  2.286e+03  -0.521   0.6025    
## freq_z         1.117e-02  1.888e-02  2.280e+03   0.592   0.5542    
## density_bin_z  1.950e-01  8.142e-02  8.292e+00   2.394   0.0425 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) rf_md_ yr_md_ freq_z
## ref_med_z    0.024                     
## year_med_z  -0.017 -0.173              
## freq_z      -0.015  0.005  0.105       
## densty_bn_z  0.530 -0.034 -0.103 -0.038
## Linear mixed model fit by REML ['lmerModLmerTest']
## Formula: fm_5
##    Data: df_vectorizer
## REML criterion at convergence: 2674.497
## Random effects:
##  Groups   Name          Std.Dev. Corr
##  field    (Intercept)   0.6645       
##           density_bin_z 0.2403   0.54
##  Residual               0.4234       
## Number of obs: 2293, groups:  field, 9
## Fixed Effects:
##   (Intercept)      ref_med_z     year_med_z         freq_z  density_bin_z  
##     -0.107935       0.151210      -0.007479       0.011169       0.194950
getrefs("cpy_med", "SBERT", TRUE)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: fm_5
##    Data: df_vectorizer
## Control: lmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 2e+05))
## 
## REML criterion at convergence: 2701.8
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -2.9371 -0.5057 -0.0747  0.3894  5.9759 
## 
## Random effects:
##  Groups   Name          Variance Std.Dev. Corr
##  field    (Intercept)   0.44247  0.6652       
##           density_bin_z 0.01275  0.1129   0.60
##  Residual               0.18860  0.4343       
## Number of obs: 2236, groups:  field, 9
## 
## Fixed effects:
##                 Estimate Std. Error         df t value Pr(>|t|)    
## (Intercept)   -2.219e-01  2.230e-01  8.073e+00  -0.995    0.349    
## ref_med_z      1.435e-01  2.086e-02  1.718e+03   6.878 8.46e-12 ***
## year_med_z     7.255e-03  1.528e-02  2.197e+03   0.475    0.635    
## freq_z        -6.840e-02  4.327e-02  2.226e+03  -1.581    0.114    
## density_bin_z  5.579e-02  4.346e-02  1.057e+01   1.284    0.227    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) rf_md_ yr_md_ freq_z
## ref_med_z    0.030                     
## year_med_z  -0.028 -0.172              
## freq_z      -0.011  0.010 -0.003       
## densty_bn_z  0.535 -0.143 -0.271 -0.002
## Linear mixed model fit by REML ['lmerModLmerTest']
## Formula: fm_5
##    Data: df_vectorizer
## REML criterion at convergence: 2701.799
## Random effects:
##  Groups   Name          Std.Dev. Corr
##  field    (Intercept)   0.6652       
##           density_bin_z 0.1129   0.60
##  Residual               0.4343       
## Number of obs: 2236, groups:  field, 9
## Fixed Effects:
##   (Intercept)      ref_med_z     year_med_z         freq_z  density_bin_z  
##     -0.221861       0.143473       0.007255      -0.068404       0.055791
getrefs("cpy_med", "GPT2", TRUE)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: fm_5
##    Data: df_vectorizer
## Control: lmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 2e+05))
## 
## REML criterion at convergence: 5886
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -2.9412 -0.5544 -0.1130  0.4424  5.8618 
## 
## Random effects:
##  Groups   Name          Variance Std.Dev. Corr 
##  field    (Intercept)   0.49398  0.7028        
##           density_bin_z 0.01357  0.1165   -0.33
##  Residual               0.24985  0.4999        
## Number of obs: 3981, groups:  field, 9
## 
## Fixed effects:
##                Estimate Std. Error        df t value Pr(>|t|)    
## (Intercept)   2.354e-02  2.344e-01 7.984e+00   0.100   0.9225    
## ref_med_z     1.445e-01  1.396e-02 3.973e+03  10.346   <2e-16 ***
## year_med_z    2.255e-02  1.039e-02 3.972e+03   2.170   0.0300 *  
## freq_z        1.159e-02  9.363e-03 3.968e+03   1.237   0.2160    
## density_bin_z 9.694e-02  4.007e-02 8.230e+00   2.419   0.0411 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) rf_md_ yr_md_ freq_z
## ref_med_z   -0.001                     
## year_med_z   0.001 -0.341              
## freq_z       0.001 -0.010 -0.041       
## densty_bn_z -0.318  0.033 -0.054 -0.088
## Linear mixed model fit by REML ['lmerModLmerTest']
## Formula: fm_5
##    Data: df_vectorizer
## REML criterion at convergence: 5886.016
## Random effects:
##  Groups   Name          Std.Dev. Corr 
##  field    (Intercept)   0.7028        
##           density_bin_z 0.1165   -0.33
##  Residual               0.4999        
## Number of obs: 3981, groups:  field, 9
## Fixed Effects:
##   (Intercept)      ref_med_z     year_med_z         freq_z  density_bin_z  
##       0.02354        0.14448        0.02255        0.01159        0.09694
getrefs("cpy_med", "Word2Vec", TRUE)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: fm_5
##    Data: df_vectorizer
## Control: lmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 2e+05))
## 
## REML criterion at convergence: 3704.3
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -3.9179 -0.5408 -0.0782  0.3886  5.4505 
## 
## Random effects:
##  Groups   Name          Variance Std.Dev. Corr
##  field    (Intercept)   0.50971  0.7139       
##           density_bin_z 0.01459  0.1208   0.38
##  Residual               0.22268  0.4719       
## Number of obs: 2697, groups:  field, 9
## 
## Fixed effects:
##                 Estimate Std. Error         df t value Pr(>|t|)    
## (Intercept)     -0.15774    0.23824    7.96916  -0.662   0.5266    
## ref_med_z        0.11736    0.02164 2382.66994   5.423 6.47e-08 ***
## year_med_z       0.01173    0.01630 2551.20271   0.720   0.4718    
## freq_z          -0.01933    0.01129 2682.29647  -1.713   0.0868 .  
## density_bin_z   -0.03373    0.04454   10.16069  -0.757   0.4660    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) rf_md_ yr_md_ freq_z
## ref_med_z    0.010                     
## year_med_z  -0.001 -0.146              
## freq_z      -0.004  0.018 -0.053       
## densty_bn_z  0.339 -0.210 -0.245 -0.004
## Linear mixed model fit by REML ['lmerModLmerTest']
## Formula: fm_5
##    Data: df_vectorizer
## REML criterion at convergence: 3704.345
## Random effects:
##  Groups   Name          Std.Dev. Corr
##  field    (Intercept)   0.7139       
##           density_bin_z 0.1208   0.38
##  Residual               0.4719       
## Number of obs: 2697, groups:  field, 9
## Fixed Effects:
##   (Intercept)      ref_med_z     year_med_z         freq_z  density_bin_z  
##      -0.15774        0.11736        0.01173       -0.01933       -0.03373
getrefs("cpy_med", "BOW", TRUE)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: fm_5
##    Data: df_vectorizer
## Control: lmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 2e+05))
## 
## REML criterion at convergence: 3905.9
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -3.5459 -0.5489 -0.0859  0.4321  5.8348 
## 
## Random effects:
##  Groups   Name          Variance Std.Dev. Corr
##  field    (Intercept)   0.5847   0.7646       
##           density_bin_z 0.0170   0.1304   0.70
##  Residual               0.2675   0.5172       
## Number of obs: 2519, groups:  field, 8
## 
## Fixed effects:
##                 Estimate Std. Error         df t value Pr(>|t|)    
## (Intercept)     -0.21031    0.27111    6.95790  -0.776  0.46346    
## ref_med_z        0.12214    0.02114 2291.08060   5.777 8.63e-09 ***
## year_med_z       0.01549    0.01459 1152.08085   1.062  0.28867    
## freq_z          -0.03822    0.01996 2469.16024  -1.915  0.05558 .  
## density_bin_z   -0.19559    0.05392    8.53926  -3.627  0.00602 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) rf_md_ yr_md_ freq_z
## ref_med_z    0.017                     
## year_med_z  -0.020 -0.194              
## freq_z       0.005 -0.068 -0.089       
## densty_bn_z  0.619 -0.096 -0.176  0.082
## Linear mixed model fit by REML ['lmerModLmerTest']
## Formula: fm_5
##    Data: df_vectorizer
## REML criterion at convergence: 3905.925
## Random effects:
##  Groups   Name          Std.Dev. Corr
##  field    (Intercept)   0.7646       
##           density_bin_z 0.1304   0.70
##  Residual               0.5172       
## Number of obs: 2519, groups:  field, 8
## Fixed Effects:
##   (Intercept)      ref_med_z     year_med_z         freq_z  density_bin_z  
##      -0.21031        0.12214        0.01549       -0.03822       -0.19559
# Create a new df grouped by vectorizer and field (should have done this above, refactor)

df_grouped <- df_all %>%
  group_by(field, vectorizer) %>%
  mutate_at(vars(cpy_med, density_bin, year_med, ref_med),
            list(z = ~scale(.))) %>%
  filter(cpy_med_z < 3,
         cpy_med_z > -3,
         density_bin_z < 3,
         density_bin_z > -3,
         ref_med_z < 3,
         ref_med_z > -3,
         # year_med_z < 3,
         # year_med_z > -3
        )

# head(df_grouped)

# drop gpt for now

for (vec in unique(df_grouped$vectorizer)) {
  df_vec <- df_grouped %>% filter(
    vectorizer == vec
  )
  
  plot <- (
  ggplot(
    df_vec,
    aes(
      # x=year_med_z,
      # y=density_bin_z,
      # What happens if we don't consider the z-score? Then GPT screws stuff up
      x=year_med,
      y=density_bin,
      # color=field,
    )
  )
  + geom_smooth(
    method="lm",
    linewidth=3,
    color="black",
  )
  + geom_point(
    aes(
      color=field,
    ),
    size=0.5,
    alpha=0.2,
  )
  + geom_smooth(
    aes(
      color=field,      
    ),
    # method="lm",
    linewidth=1,
  )
  # + facet_grid("~ vectorizer")
  # + facet_wrap("vectorizer")
  + theme_classic()
  + ggtitle(vec)
  + theme(
    axis.text = element_text(size=20)
  )
)
  print(plot)
}
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

# 
# ggsave(
#   "rho_vs_year_by_vectorizers.png",
#   width=12,
#   height=3,
# )
# NOTE: need to loop over y var too!
for (y in c("cpy_med", "log_cpy_var")){
for (vec in unique(df_grouped$vectorizer)) {

  df_vec <- df_all %>% filter(
    vectorizer == vec
  ) %>% mutate(
    # z-scale
    y_z = scale(.data[[y]]),
    density_bin_z = scale(density_bin),
    year_med_z = scale(year_med),
    ref_med_z = scale(ref_med),
    freq_z = scale(freq),
  ) %>% filter(
    y_z < 3,
    y_z > -3,
    density_bin_z < 3,
    density_bin_z > -3,
    ref_med_z < 3,
    ref_med_z > -3
  )
  
  main_model <- lmer(
    y_z ~ ref_med_z + year_med_z + freq_z + density_bin_z + (1  | field),
    data=df_vec, 
    control=lmerControl(optimizer="bobyqa", optCtrl=list(maxfun=2e5))
  )  
  
  ablated_model <- lmer(
    y_z ~ ref_med_z + year_med_z + freq_z + (1 | field),
    data=df_vec,
    control=lmerControl(optimizer="bobyqa", optCtrl=list(maxfun=2e5))
  )
  
  simpler_model <- lmer(
    y_z ~ density_bin_z + (1 | field),
    data=df_vec,
    control=lmerControl(optimizer="bobyqa", optCtrl=list(maxfun=2e5))    
  )

  print("--------------------")
  print(paste(vec, "x", y))
  print("--------------------")
  # print(anova(
  #   main_model,
  #   ablated_model
  # ))
  # print("\n")
  
  # print(summary(simpler_model))
  print(summary(main_model))
  
}
}
## [1] "--------------------"
## [1] "SciBERT x cpy_med"
## [1] "--------------------"
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: y_z ~ ref_med_z + year_med_z + freq_z + density_bin_z + (1 |  
##     field)
##    Data: df_vec
## Control: lmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 2e+05))
## 
## REML criterion at convergence: 3179.3
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -2.8824 -0.5529 -0.0661  0.4726  4.5826 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  field    (Intercept) 0.4221   0.6497  
##  Residual             0.2223   0.4715  
## Number of obs: 2325, groups:  field, 9
## 
## Fixed effects:
##                 Estimate Std. Error         df t value Pr(>|t|)    
## (Intercept)   -1.405e-01  2.171e-01  7.974e+00  -0.647  0.53568    
## ref_med_z      1.048e-01  1.887e-02  2.319e+03   5.557 3.05e-08 ***
## year_med_z     4.183e-02  1.469e-02  2.319e+03   2.848  0.00444 ** 
## freq_z        -1.843e-03  1.478e-02  2.303e+03  -0.125  0.90076    
## density_bin_z  1.875e-01  1.426e-02  2.317e+03  13.147  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) rf_md_ yr_md_ freq_z
## ref_med_z    0.019                     
## year_med_z  -0.015 -0.166              
## freq_z      -0.035  0.027  0.116       
## densty_bn_z  0.012 -0.199 -0.539 -0.237
## [1] "--------------------"
## [1] "SBERT x cpy_med"
## [1] "--------------------"
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: y_z ~ ref_med_z + year_med_z + freq_z + density_bin_z + (1 |  
##     field)
##    Data: df_vec
## Control: lmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 2e+05))
## 
## REML criterion at convergence: 2771.7
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -3.0400 -0.5073 -0.0726  0.3905  6.2222 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  field    (Intercept) 0.4047   0.6361  
##  Residual             0.1927   0.4389  
## Number of obs: 2263, groups:  field, 9
## 
## Fixed effects:
##                 Estimate Std. Error         df t value Pr(>|t|)    
## (Intercept)   -1.786e-01  2.129e-01  8.062e+00  -0.839    0.426    
## ref_med_z      2.100e-01  1.856e-02  2.258e+03  11.317   <2e-16 ***
## year_med_z     1.679e-02  1.481e-02  2.258e+03   1.134    0.257    
## freq_z        -1.699e-03  1.234e-02  2.229e+03  -0.138    0.890    
## density_bin_z  1.667e-02  1.806e-02  2.257e+03   0.924    0.356    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) rf_md_ yr_md_ freq_z
## ref_med_z    0.023                     
## year_med_z  -0.023 -0.217              
## freq_z      -0.050  0.021  0.014       
## densty_bn_z  0.015 -0.245 -0.570 -0.036
## [1] "--------------------"
## [1] "GPT2 x cpy_med"
## [1] "--------------------"
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: y_z ~ ref_med_z + year_med_z + freq_z + density_bin_z + (1 |  
##     field)
##    Data: df_vec
## Control: lmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 2e+05))
## 
## REML criterion at convergence: 6086.6
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -3.0228 -0.5643 -0.1108  0.4398  5.8682 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  field    (Intercept) 0.4641   0.6812  
##  Residual             0.2608   0.5107  
## Number of obs: 4017, groups:  field, 9
## 
## Fixed effects:
##                Estimate Std. Error        df t value Pr(>|t|)    
## (Intercept)   3.505e-03  2.272e-01 7.986e+00   0.015    0.988    
## ref_med_z     1.308e-01  1.394e-02 4.012e+03   9.387   <2e-16 ***
## year_med_z    1.189e-02  1.048e-02 4.010e+03   1.134    0.257    
## freq_z        1.181e-03  8.980e-03 4.004e+03   0.132    0.895    
## density_bin_z 1.295e-01  9.514e-03 4.007e+03  13.607   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) rf_md_ yr_md_ freq_z
## ref_med_z   -0.001                     
## year_med_z   0.001 -0.348              
## freq_z      -0.001 -0.028 -0.043       
## densty_bn_z -0.001  0.155 -0.195 -0.354
## [1] "--------------------"
## [1] "Word2Vec x cpy_med"
## [1] "--------------------"
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: y_z ~ ref_med_z + year_med_z + freq_z + density_bin_z + (1 |  
##     field)
##    Data: df_vec
## Control: lmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 2e+05))
## 
## REML criterion at convergence: 3805
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -3.9918 -0.5621 -0.0729  0.3970  5.6923 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  field    (Intercept) 0.4590   0.6775  
##  Residual             0.2281   0.4776  
## Number of obs: 2739, groups:  field, 9
## 
## Fixed effects:
##                 Estimate Std. Error         df t value Pr(>|t|)    
## (Intercept)     -0.12863    0.22603    7.98175  -0.569    0.585    
## ref_med_z        0.19219    0.01917 2733.85142  10.024  < 2e-16 ***
## year_med_z       0.00393    0.01512 2733.12028   0.260    0.795    
## freq_z          -0.02472    0.01044 2729.12118  -2.367    0.018 *  
## density_bin_z   -0.06946    0.01718 2730.13329  -4.043 5.43e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) rf_md_ yr_md_ freq_z
## ref_med_z    0.008                     
## year_med_z   0.001 -0.180              
## freq_z      -0.006  0.006 -0.047       
## densty_bn_z -0.004 -0.422 -0.565 -0.010
## [1] "--------------------"
## [1] "BOW x cpy_med"
## [1] "--------------------"
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: y_z ~ ref_med_z + year_med_z + freq_z + density_bin_z + (1 |  
##     field)
##    Data: df_vec
## Control: lmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 2e+05))
## 
## REML criterion at convergence: 3995.5
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -3.7436 -0.5248 -0.0816  0.4096  6.3947 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  field    (Intercept) 0.5732   0.7571  
##  Residual             0.2630   0.5128  
## Number of obs: 2611, groups:  field, 8
## 
## Fixed effects:
##                 Estimate Std. Error         df t value Pr(>|t|)    
## (Intercept)   -1.762e-01  2.680e-01  6.976e+00  -0.658   0.5318    
## ref_med_z      1.410e-01  2.044e-02  2.604e+03   6.899 6.56e-12 ***
## year_med_z     6.554e-03  1.341e-02  2.603e+03   0.489   0.6250    
## freq_z        -2.925e-02  1.327e-02  2.606e+03  -2.205   0.0275 *  
## density_bin_z -1.712e-01  1.981e-02  2.606e+03  -8.642  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) rf_md_ yr_md_ freq_z
## ref_med_z    0.017                     
## year_med_z  -0.001 -0.208              
## freq_z      -0.015 -0.064 -0.082       
## densty_bn_z -0.002 -0.242 -0.160  0.123
## [1] "--------------------"
## [1] "SciBERT x log_cpy_var"
## [1] "--------------------"
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: y_z ~ ref_med_z + year_med_z + freq_z + density_bin_z + (1 |  
##     field)
##    Data: df_vec
## Control: lmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 2e+05))
## 
## REML criterion at convergence: 4903.7
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -3.9213 -0.6187 -0.0704  0.5328  4.9642 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  field    (Intercept) 0.1820   0.4266  
##  Residual             0.4675   0.6837  
## Number of obs: 2331, groups:  field, 9
## 
## Fixed effects:
##                 Estimate Std. Error         df t value Pr(>|t|)    
## (Intercept)      0.01470    0.14379    7.88232   0.102  0.92113    
## ref_med_z       -0.04561    0.02733 2139.36161  -1.669  0.09534 .  
## year_med_z      -0.04086    0.02114 2291.48303  -1.933  0.05338 .  
## freq_z          -0.06035    0.02137 1619.40503  -2.825  0.00479 ** 
## density_bin_z   -0.30218    0.02076 2321.15025 -14.557  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) rf_md_ yr_md_ freq_z
## ref_med_z    0.041                     
## year_med_z  -0.032 -0.167              
## freq_z      -0.076  0.039  0.107       
## densty_bn_z  0.023 -0.207 -0.541 -0.215
## [1] "--------------------"
## [1] "SBERT x log_cpy_var"
## [1] "--------------------"
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: y_z ~ ref_med_z + year_med_z + freq_z + density_bin_z + (1 |  
##     field)
##    Data: df_vec
## Control: lmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 2e+05))
## 
## REML criterion at convergence: 4975.7
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -4.0497 -0.6652 -0.0509  0.5603  4.2597 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  field    (Intercept) 0.1743   0.4175  
##  Residual             0.5131   0.7163  
## Number of obs: 2266, groups:  field, 9
## 
## Fixed effects:
##                 Estimate Std. Error         df t value Pr(>|t|)    
## (Intercept)    5.475e-02  1.425e-01  8.211e+00   0.384   0.7105    
## ref_med_z      5.184e-02  3.000e-02  2.157e+03   1.728   0.0842 .  
## year_med_z    -1.764e-02  2.374e-02  2.086e+03  -0.743   0.4574    
## freq_z         6.144e-03  1.981e-02  1.256e+03   0.310   0.7564    
## density_bin_z -2.752e-01  2.938e-02  2.202e+03  -9.367   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) rf_md_ yr_md_ freq_z
## ref_med_z    0.050                     
## year_med_z  -0.049 -0.207              
## freq_z      -0.112  0.034 -0.005       
## densty_bn_z  0.029 -0.265 -0.562 -0.016
## [1] "--------------------"
## [1] "GPT2 x log_cpy_var"
## [1] "--------------------"
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: y_z ~ ref_med_z + year_med_z + freq_z + density_bin_z + (1 |  
##     field)
##    Data: df_vec
## Control: lmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 2e+05))
## 
## REML criterion at convergence: 9524.5
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -2.9929 -0.6787 -0.1161  0.5348  4.0942 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  field    (Intercept) 0.1470   0.3835  
##  Residual             0.6202   0.7875  
## Number of obs: 4008, groups:  field, 9
## 
## Fixed effects:
##                 Estimate Std. Error         df t value Pr(>|t|)    
## (Intercept)     -0.09058    0.12844    7.93687  -0.705  0.50086    
## ref_med_z       -0.03980    0.02146 3609.80797  -1.854  0.06375 .  
## year_med_z      -0.08804    0.01611 3951.11446  -5.466 4.90e-08 ***
## freq_z           0.05186    0.01378 3997.93074   3.764  0.00017 ***
## density_bin_z   -0.08760    0.01471 4001.39249  -5.953 2.86e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) rf_md_ yr_md_ freq_z
## ref_med_z   -0.002                     
## year_med_z   0.002 -0.352              
## freq_z      -0.003 -0.026 -0.044       
## densty_bn_z -0.004  0.154 -0.192 -0.352
## [1] "--------------------"
## [1] "Word2Vec x log_cpy_var"
## [1] "--------------------"
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: y_z ~ ref_med_z + year_med_z + freq_z + density_bin_z + (1 |  
##     field)
##    Data: df_vec
## Control: lmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 2e+05))
## 
## REML criterion at convergence: 6374.1
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -3.2903 -0.6572 -0.0841  0.5701  3.8504 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  field    (Intercept) 0.2574   0.5073  
##  Residual             0.5867   0.7660  
## Number of obs: 2738, groups:  field, 9
## 
## Fixed effects:
##                 Estimate Std. Error         df t value Pr(>|t|)   
## (Intercept)   -7.979e-03  1.698e-01  7.959e+00  -0.047   0.9637   
## ref_med_z     -9.074e-03  3.054e-02  2.666e+03  -0.297   0.7664   
## year_med_z    -5.944e-03  2.413e-02  2.698e+03  -0.246   0.8054   
## freq_z         1.871e-02  1.683e-02  2.733e+03   1.112   0.2663   
## density_bin_z -8.373e-02  2.766e-02  2.731e+03  -3.027   0.0025 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) rf_md_ yr_md_ freq_z
## ref_med_z    0.016                     
## year_med_z   0.001 -0.188              
## freq_z      -0.014  0.009 -0.046       
## densty_bn_z -0.007 -0.419 -0.563 -0.010
## [1] "--------------------"
## [1] "BOW x log_cpy_var"
## [1] "--------------------"
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: y_z ~ ref_med_z + year_med_z + freq_z + density_bin_z + (1 |  
##     field)
##    Data: df_vec
## Control: lmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 2e+05))
## 
## REML criterion at convergence: 6044
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -3.7600 -0.6873 -0.0779  0.5890  4.2408 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  field    (Intercept) 0.2609   0.5108  
##  Residual             0.5765   0.7593  
## Number of obs: 2617, groups:  field, 8
## 
## Fixed effects:
##                 Estimate Std. Error         df t value Pr(>|t|)    
## (Intercept)      0.03425    0.18153    6.92650   0.189   0.8558    
## ref_med_z        0.03151    0.02994 2371.48232   1.053   0.2926    
## year_med_z       0.00572    0.01982 2605.87273   0.289   0.7730    
## freq_z          -0.03525    0.01962 2557.28915  -1.797   0.0724 .  
## density_bin_z   -0.20754    0.02900 2504.93731  -7.157 1.08e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) rf_md_ yr_md_ freq_z
## ref_med_z    0.036                     
## year_med_z  -0.003 -0.215              
## freq_z      -0.034 -0.056 -0.081       
## densty_bn_z -0.004 -0.245 -0.155  0.116